
rm(list = ls())

library(tidyverse)
library(readxl)
library(lubridate)

region_raw <- read_excel("k_RegionDataRaw.xlsx")



region_dta_temp <- region_raw %>%
  select(start_time = V8, end_time = V9, randomid = RandomID,
         matches("S1|S2", ignore.case = FALSE),
         matches("G1|G2|G3", ignore.case = FALSE),
         contains("lottery"), contains("PSM"), contains("big5"),
         matches("job_sect|job_pref|job_likely|job_qualities"),
         bribe_nonexp = dishonesty_4,
         age, male = gender, class_yr,
         dep = fakultet,
         gpa = GPA, olympiad, contains("EGE"), region,
         city_size, contains("religion"), family_income,
         contains("relatives"), 
         soc_science_knowledg, device, 
         Q119, Q120, Q121, Q122_6
  )


#OTHER PRELIMINARY CLEANING

region_dta_temp <- region_dta_temp %>% mutate_at(vars(contains("_work_")), list(~replace_na(., 0)))



#CREATING DATA SET

region_dta <- region_dta_temp %>% transmute(
                                #DICTATOR GAME
                                donate = G1_donate_1, 
                                #converts to ruble amount
                                donate25 = donate*25,
                                #RISK AVERSION MEASURE
                                risk_averse1 = 8 - (lottery_1 + lottery_2 + lottery_3 + lottery_4 + lottery_5 + lottery_6 + lottery_7),
                                #this version removes subjects with non-transitive preferences
                                risk_averse2 = ifelse((lottery_1 <= lottery_2 & 
                                                         lottery_2 <= lottery_3 & 
                                                         lottery_3 <= lottery_4 &
                                                         lottery_4 <= lottery_5 &
                                                         lottery_5 <= lottery_6 & 
                                                         lottery_6 <= lottery_7) == TRUE, risk_averse1, NA),
                                #CAREER PREFERENCE VARIABLES
                                job_sect,
                                fed_pref = job_pref_1,
                                reg_pref = job_pref_2,
                                corp_pref = job_pref_3,
                                sme_pref = job_pref_4,
                                owner_pref = job_pref_5,
                                finance_pref = job_pref_6,
                                consult_pref = job_pref_7,
                                ngo_pref = job_pref_8,
                                gov_pref = job_pref_9,
                                fed_exp = job_likely_1,
                                reg_exp = job_likely_2,
                                corp_exp = job_likely_3,
                                sme_exp = job_likely_4,
                                owner_exp = job_likely_5,
                                finance_exp = job_likely_6,
                                consult_exp = job_likely_7,
                                ngo_exp = job_likely_8,
                                gov_exp = job_likely_9,
                                pub_avg = (fed_pref + reg_pref + gov_pref)/3,
                                priv_avg = (corp_pref + finance_pref + sme_pref + consult_pref + owner_pref)/5,
                                #JOB ATTRIBUTES
                                job_stable = job_qualities_1,
                                job_income = job_qualities_2,
                                job_bens = job_qualities_3,
                                job_promo = job_qualities_4,
                                job_interest = job_qualities_5,
                                job_altruism = job_qualities_6,
                                job_social = job_qualities_7,
                                job_schedule = job_qualities_8,
                                job_connect = job_qualities_9,
                                job_prestige = job_qualities_10,
                                job_intrinsic = (job_social + job_altruism + job_interest)/3,
                                job_extrinsic = (job_income + job_connect + job_prestige)/3,
                                job_pragmatic = (job_stable + job_promo + job_bens + job_schedule)/4,
                                #PSM
                                aps1 = PSM_1, aps2 = PSM_2, aps3 = PSM_3, aps4 = PSM_4,
                                cpv1 = PSM_5, cpv2 = PSM_6, cpv3 = PSM_7, cpv4 = PSM_8,
                                com1 = PSM_9, com2 = PSM_10, com3 = PSM_11, com4 = PSM_12,
                                ss1 = PSM_13, ss2 = PSM_14, ss3 = PSM_15, ss4 = PSM_16,
                                aps = (aps1 + aps2 + aps3 + aps4)/4,
                                cpv = (cpv1 + cpv2 + cpv3 + cpv4)/4,
                                com = (com1 + com2 + com3 + com4)/4,
                                ss = (ss1 + ss2 + ss3 + ss4)/4,
                                psm = (aps + cpv + com + ss)/4,
                                #CONTROL VARIABLES
                                age, male, olympiad, gpa, family_income,
                                city_size, religion, religion_services,
                                religion_denom = recode(religion_denom, `1` = "Orthodox Christianity", `2` =  "Protestantism",
                                                        `3` = "Catholicism", `4` = "Islam", `5` = "Judaism", `6` = "Buddhism",
                                                        `7` = "Other"),
                                ege_taken = EGE_taken,
                                ege_math = as.numeric(EGE_3_TEXT),
                                ege_ru = as.numeric(EGE_10_TEXT),
                                ege_avg = (ege_math + ege_ru)/2,
                                region = recode(region, `63` = "Regional Capital", `74` = "Oblast (minus capital)", `79` = "Oblast (minus capital)", 
                                                `80` = "Oblast (minus capital)", `84` = "Oblast (minus capital)", .default = "Other"),
                                f_work_fed = relatives_work_1_1, m_work_fed = relatives_work_1_2, r_work_fed = relatives_work_1_3,
                                f_work_reg = relatives_work_2_1, m_work_reg = relatives_work_2_2, r_work_reg = relatives_work_2_3,
                                f_work_gov = relatives_work_3_1, m_work_gov = relatives_work_3_2, r_work_gov = relatives_work_3_3,
                                f_work_corp = relatives_work_4_1, m_work_corp = relatives_work_4_2, r_work_corp = relatives_work_4_3,
                                f_work_sme = relatives_work_5_1, m_work_sme = relatives_work_5_2, r_work_sme = relatives_work_5_3,
                                f_work_owner = relatives_work_6_1, m_work_owner = relatives_work_6_2, r_work_owner = relatives_work_6_3,
                                f_work_finance = relatives_work_7_1, m_work_finance = relatives_work_7_2, r_work_finance = relatives_work_7_3,
                                f_work_ngo = relatives_work_8_1, m_work_ngo = relatives_work_8_2, r_work_ngo = relatives_work_8_3,
                                f_work_law = relatives_work_9_1, m_work_law = relatives_work_9_2, r_work_law = relatives_work_9_3,
                                f_work_mil = relatives_work_10_1, m_work_mil = relatives_work_10_2, r_work_mil = relatives_work_10_3,
                                f_work_other = relatives_work_11_1, m_work_other = relatives_work_11_2, r_work_other = relatives_work_11_3,
                                f_no_work = relatives_work_12_1, m_no_work = relatives_work_12_2, r_no_work = relatives_work_12_3,
                                f_work_na = relatives_work_13_1, m_work_na = relatives_work_13_2, r_work_na = relatives_work_13_3,
                                f_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1, 1, 0),
                                m_pubsect = ifelse(m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0),
                                r_pubsect = ifelse(r_work_fed == 1 | r_work_reg == 1 | r_work_gov == 1, 1, 0),
                                rel_pubsect = ifelse(f_pubsect == 1 | m_pubsect == 1 | r_pubsect == 1, 1, 0),
                                #collapsing all MA students together
                                class_yr = recode(class_yr, `6` = 5),
                                #field of study
                                dep = recode_factor(dep, `1` = "IGUP_PubAdmin", `2` = "IGUP_Comm", `3` = "IGUP_Trade", 
                                                    `9` = "Other", `10` = "IGUP_EconSec", `11`= "IGUP_PubPol",
                                                    `12` = "IGUP_Management", `13` = "VShEM_Econ", `14` = "VShEM_Management",
                                                    `15` = "VShEM_Customs", `16` = "VShEM_BusCS", `17` = "VShEM_EconSec",
                                                    `18` = "VShEM_SocPol", `19` = "VShEM_AppliedCS", `20` = "VShEM_Finance",
                                                    `22` = "UGI_Pol", `23` = "UGI_Soc", `24` = "UGI_HR", `26` = "UGI_IR"),
                                dep2 = case_when(
                                          grepl("IGUP", dep) ~ "IGUP",
                                          grepl("VShEM", dep) ~ "VShEM",
                                          grepl("UGI|Other", dep) ~ "Other"),
                                #QUALITY CHECK VARIABLES
                                randomid, soc_sci_knowledge = soc_science_knowledg, device,
                                previous_knowledge = Q119,
                                prev_know_source = Q120,
                                newspapers = Q121,
                                kommersant = ifelse(is.na(Q122_6) == F, 1, 0), 
                                #time to complete
                                duration = round(difftime(end_time, start_time, units=c("auto","secs","mins","hours","days","weeks")), 0),
                                #screeners
                                screener1 = ifelse(is.na(S1_27) == F & is.na(S1_29) == F, 1, 0),
                                screener2 = ifelse(is.na(S2_24) == F, 1, 0),
                                screener = screener1 + screener2
                          )





#converting tibble to dataframe
RegionClean_DF <- as.data.frame(region_dta)

#write.csv(RegionClean_DF, "RegionClean.csv", na = "", row.names = F)


